## packages: remove or add your necessary packages

required_packages <- c("tidyverse", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools")

library(ggplot2)    # CRAN v3.3.6
library(colorspace) # CRAN v2.0-3
library(here)       # CRAN v1.0.1
library(dplyr)      # CRAN v1.0.10
library(janitor)    # CRAN v2.1.0
library(gt)         # CRAN v0.5.0
library(tidyr)      # CRAN v1.2.1
library(readr)      # CRAN v2.1.3
library(stringr)    # CRAN v1.4.1
library(tidytext)
library(ggalt)
library(forcats)
library(lubridate)
library(ggforce)

# for(i in required_packages) { 
# if(!require(i, character.only = T)) {
# 
# #  if package is not existing, install then load the package
# install.packages(i, dependencies = T)
# require(i, character.only = T)
# }
# }


## save plots?
save <- TRUE
#save <- FALSE

## quality of png's
dpi <- 750

## font adjust; please adjust to client´s website
#extrafont::loadfonts(device = "win", quiet = TRUE)
#font_add_google("Montserrat", "Montserrat")
# font_add_google("Overpass", "Overpass")
# font_add_google("Overpass Mono", "Overpass Mono")



## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))


theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))

## theme settings for maps
theme_map <- 
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))

## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)

## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))

## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)

Load data

youtube_data <- read_csv(here("proc_data","youtube_data_proc.csv"))
youtube_data_activities <- read_csv(here("proc_data","youtube_data_activities_proc.csv"))
tiktok_data <-  read_csv(here("proc_data","tiktok_data_proc.csv"))
tiktok_data_activities <-  read_csv(here("proc_data","tiktok_data_activities_proc.csv"))

Calculate general stats

yt_vids <- youtube_data %>% distinct(yt_video_id) %>% nrow()
tt_vids <- tiktok_data %>% distinct(tt_video_id) %>% nrow()
youtube_data_activities %>% group_by(yt_video_id) %>% summarise(idn=max(idea)) %>% 
   pull(idn) %>% {length(which(.>1))}-> mult_ideas_yt
tiktok_data_activities %>% group_by(tt_video_id) %>% summarise(idn=max(idea)) %>% 
   pull(idn) %>% {length(which(.>1))} -> mult_ideas_tt

meanytlength <- youtube_data$video_length %>% summary %>% {./60}
meanttlength <- tiktok_data$video_meta_duration %>% summary 

General stats

YouTube: 177 videos (unique video url identifiers, includes YT shorts)
TikTok: 177 videos

YouTube videos are longer (12.15141 minutes on average for the sampled videos), so approximately one third of the videos examined (53/177) included >1 money-making idea. TikTok videos have a shorter maximum length (3 to 10 minutes; 41.14384 seconds on average for the sampled videos) so videos on this platform tend to feature a single idea. Only 4 of the 145 TikTok videos examined provided more than one money-making idea.

Publication dates

youtube_data <- youtube_data %>% mutate(month=month(ymd(youtube_data$publish_date)),
                        pyear=year(ymd(youtube_data$publish_date))) %>% 
  mutate(pub_date=ymd(publish_date))


tiktok_data <- tiktok_data %>% mutate(month=month(ymd_hms(tiktok_data$create_time_iso)),                                pyear=year(ymd_hms(tiktok_data$create_time_iso))) %>%
  mutate(pub_date=date(ymd_hms(create_time_iso))) 

3/4 of the YouTube videos examined were published in 2022, and across all the videos sampled (published since 2018), most are from the summer/fall season (Northern Hemisphere).

TikTok videos in the sample were published between 2019-2022, with more videos uploaded with each passing year. The month with most uploads is July.

tiktok_data %>% tabyl(pyear) %>% round(2)
##  pyear  n percent
##   2019  4    0.03
##   2020 29    0.20
##   2021 50    0.34
##   2022 63    0.43

Publication month also varied between platforms.

youtube_data %>% count(month) %>% 
  ggplot()+
  geom_bar(aes(x=month,y=n),stat = "identity")+
  scale_x_discrete(limits=month.abb) +labs(subtitle = "YouTube data")

tiktok_data %>% count(month) %>% 
  ggplot()+
  geom_bar(aes(x=month,y=n),stat = "identity")+
  scale_x_discrete(limits=month.abb) +labs(subtitle = "TikTok data")

Considering publication dates, videos published earlier do not tend to accumulate more views and comments over time. Engagement is also mostly unrelated to subscriber/follower counts and thus possibly related to content.

ttdatevc <- tiktok_data %>% select(source,pub_date,
                       comments=comment_count,
                       views=play_count,
                       followers=author_meta_fans)
ytdatevc <- youtube_data %>% select(source,pub_date,
                       comments=comments,
                       views=view_count,
                       followers=subs_numeric)
dates_views_comments <- bind_rows(ttdatevc,ytdatevc)

ggplot(dates_views_comments)+
  geom_point(aes(x=pub_date,y=views,color=source))+
  labs(x="Publication date")

ggplot(dates_views_comments)+
  geom_point(aes(x=pub_date,y=comments,color=source))+
    labs(x="Publication date")

ggplot(dates_views_comments)+
  geom_point(aes(x=views,y=comments,color=source))

ggplot(dates_views_comments)+
  geom_point(aes(x=followers,y=comments,color=source))

dates_views_comments %>% filter(followers!=44100000) %>% 
ggplot()+
  geom_point(aes(x=followers,y=comments,color=source))+
  labs(subtitle = "removed outlier")

ggplot(dates_views_comments)+
  geom_point(aes(x=followers,y=views,color=source))

dates_views_comments %>% filter(followers!=44100000) %>% 
ggplot()+
  geom_point(aes(x=followers,y=views,color=source))+
  labs(subtitle = "removed outlier")

Presenter demographics

yt_presenter_demog_gend <- youtube_data %>% tabyl(presenter_gender) %>% 
  mutate(valid_percent=round(valid_percent,2))
yt_malepct <- yt_presenter_demog_gend$valid_percent[2] 
tt_presenter_demog_gend <- tiktok_data %>% tabyl(presenter_gender) %>% 
  mutate(valid_percent=round(valid_percent,2))
tt_malepct <- tt_presenter_demog_gend$valid_percent[2] 
yt_ages <- youtube_data %>% tabyl(presenter_age) %>% na.omit() %>% select(-percent) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent)
tt_ages <- tiktok_data %>% tabyl(presenter_age) %>% na.omit() %>% select(-percent) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent)

Male presenters were represented more on both platforms (YouTube: 0.86% and TikTok 0.8%), and the 20-30 y.o. age category had the highest proportion with ~40%.

gt(yt_ages) %>% tab_header("YouTube")
YouTube
presenter_age n percent
10 - 20 6 0.03
20 - 30 76 0.43
30 - 40 49 0.28
40 - 50 4 0.02
50+ 3 0.02
Voice-over 20 0.11
Voice-over Text-to-Speech 18 0.10
gt(tt_ages) %>% tab_header("TikTok")
TikTok
presenter_age n percent
10 - 20 14 0.10
20 - 30 57 0.40
30 - 40 24 0.17
40 - 50 5 0.04
50+ 2 0.01
Music 8 0.06
Voice-over 6 0.04
Voice-over Text-to-Speech 25 0.18

Categories

YouTube videos, as categorized by their authors, varied in assignment despite the similar overarching topic.

The most common category was Education, followed by How-to % Stlye, and then all the others.

youtube_data %>% tabyl(category) %>% arrange(-n) %>% 
  mutate(across(where(is.numeric),round,2)) %>% gt() %>% tab_header(title = "YouTube data")
YouTube data
category n percent
Education 96 0.54
Howto & Style 46 0.26
People & Blogs 27 0.15
Entertainment 7 0.04
News & Politics 1 0.01

Earnings data

ytearn <- 
youtube_data_activities %>% 
  group_by(yt_video_id,idea,earnings_timeframe) %>% 
  summarise(earn=mean(earnings,na.rm=TRUE)) %>% ungroup() %>% na.omit() %>% 
  filter(earnings_timeframe!="No timeframe provided")
ttearn <- 
  tiktok_data_activities %>% 
  group_by(tt_video_id,idea,earnings_timeframe) %>% 
  summarise(earn=mean(earnings,na.rm=TRUE)) %>% ungroup() %>% na.omit() %>% 
  filter(earnings_timeframe!="No timeframe provided")

# earnings time frames
earn_tf <- bind_rows(ytearn,ttearn) %>% count(earnings_timeframe) %>% arrange(-n)

earnings_by_tf <- 
bind_rows(ytearn,ttearn) %>% group_by(earnings_timeframe) %>% 
  summarize(median_earn=median(earn),
            min_earn=min(earn),max_earn=max(earn),
            sd_earn=sd(earn,na.rm = TRUE)) %>% arrange(-median_earn)
  • The most common time frame for earnings was for daily income, followed by months and hours.
gt(earn_tf)
earnings_timeframe n
Days 69
Months 66
Hours 56
One-time earnings 31
Minutes 21
Weeks 9
Years 5
Per Post 2
  • Longer time frames report higher median earnings.
gt(earnings_by_tf)
earnings_timeframe median_earn min_earn max_earn sd_earn
Years 100000.000 500.000 400000.00 155449.21533
Months 5375.000 15.000 300000.00 50663.54170
Weeks 1050.000 24.000 14000.00 4357.91659
Days 500.000 5.000 7000.00 1324.43451
Hours 40.000 3.000 487.85 109.77971
One-time earnings 30.000 1.000 1225.00 271.99917
Minutes 26.000 0.042 400.00 152.51315
Per Post 16.895 0.500 33.29 23.18603

Standardized earnings

temporal_earn <- c("Days","Hours","Minutes","Months","Weeks","Years")

yt_tempearn <- ytearn %>% filter(earnings_timeframe %in% temporal_earn)
tt_tempearn <- ttearn %>% filter(earnings_timeframe %in% temporal_earn)

yt_hourly_earn <- 
yt_tempearn %>% mutate(hourly_earn=case_when(
  earnings_timeframe=="Hours"~earn,
  earnings_timeframe=="Minutes"~earn/60,
  earnings_timeframe=="Days"~earn/8,
  earnings_timeframe=="Weeks"~earn/40,
  earnings_timeframe=="Months"~earn/200,
  earnings_timeframe=="Years"~earn/2400
)) %>% mutate(source="YouTube")

tt_hourly_earn <- 
  tt_tempearn %>% mutate(hourly_earn=case_when(
    earnings_timeframe=="Hours"~earn,
    earnings_timeframe=="Minutes"~earn/60,
    earnings_timeframe=="Days"~earn/8,
    earnings_timeframe=="Weeks"~earn/40,
    earnings_timeframe=="Months"~earn/200,
    earnings_timeframe=="Years"~earn/2400
  )) %>% mutate(source="TikTok")

all_earn <- bind_rows(yt_hourly_earn,tt_hourly_earn) 
hourly_med <- median(all_earn$hourly_earn)


bind_rows(yt_hourly_earn,tt_hourly_earn) %>% 
  ggplot()+
  geom_histogram(aes(hourly_earn,fill=source),color="black",alpha=0.5)

For videos that report earnings associated with a temporal reference ($ earned per unit of time), earnings can be reported in a common unit by assuming 8 hour work days and 5 day work weeks. The median hourly earnings is 37.5.

Across all videos, earnings are right-skewed. 90% of videos report hourly earnings > 275.

This distribution is also evident within earnings timeframes.

bind_rows(yt_hourly_earn,tt_hourly_earn) %>% 
  ggplot()+
  geom_histogram(aes(earn))+
  facet_wrap(~earnings_timeframe,scales = 'free')

Earnings by category (YouTube)

The more common categories (Education, Howto & Style) did not report the higher mean or median standardized earnings. Instead, the People and Blogs category and Entertainment had the top two positions.

yt_hourlycorrs <- left_join(yt_hourly_earn,youtube_data_activities)

yt_hourlycorrs_chp <- yt_hourlycorrs  %>% group_by(yt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()
yt_hourlycorrs_chp %>% 
  group_by(category) %>% summarise(mean_earn=mean(hourly_earn),
                                   med_earn=median(hourly_earn)) %>% 
  arrange(-mean_earn) %>% gt()
category mean_earn med_earn
People & Blogs 206.80750 75.00
Entertainment 127.05000 125.00
Howto & Style 90.84293 43.75
Education 68.85482 37.50
yt_hourlycorrs_chp %>% 
  ggplot(aes(x=category,y=hourly_earn,color=category))+
  geom_sina() + scale_color_discrete(guide="none")

Business types and activities

YouTube - Level 1

For all YouTube videos, the predominant Business Type for the money-making ideas was Publication, Media, and Blogs, followed by the Service Business. Other business types were less common.

# without earnings
yt_acts_chp <- youtube_data_activities  %>% group_by(yt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()

yt_bus1 <- youtube_data_activities  %>% group_by(yt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

bus1ct <- yt_bus1 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_1) %>% 
  tabyl(business_type_level_1) %>% arrange(-n)

bus1ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% 
  gt() %>% tab_header("YouTube",subtitle = "Business Types, all videos") 
YouTube
Business Types, all videos
business_type_level_1 n percent
Publication, Media & Blog 175 0.45
Service Business 122 0.31
Ecommerce & Consumer 56 0.14
Investing 27 0.07
Software & Tech 9 0.02


For videos and ideas with reported earnings, the business activity with the highest earnings (standardized) was Publication, Media, & Blog, followed by investing.

# with earninings

yt_hourlycorrs_bus1 <- yt_hourlycorrs  %>% group_by(yt_video_id,idea) %>%
unchop(business_type_level_1) %>%  ungroup()

yt_hourlycorrs_bus1 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup() %>% 
  group_by(business_type_level_1) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-median_earn) %>% gt() %>% tab_header("YouTube",
                                                subtitle = "standardized hourly earning by business types")
YouTube
standardized hourly earning by business types
business_type_level_1 mean_earn median_earn
Publication, Media & Blog 123.72364 62.50000
Investing 31.20833 26.25000
Software & Tech 20.00000 20.00000
Service Business 35.67339 17.50000
Ecommerce & Consumer 48.28748 16.77083


However, there is considerable variation in earnings across the different business types

yt_hourlycorrs_bus1 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup %>% 
  ggplot()+
  geom_sina(aes(x=str_wrap(business_type_level_1,12),
                  y=hourly_earn,color=business_type_level_1))+
  scale_color_discrete(guide="none")+labs(x="Business Type (level 1)")

YouTube - Level 2

yt_bus2 <- youtube_data_activities  %>% group_by(yt_video_id,idea) %>%
  unchop(business_type_level_2) %>%  ungroup()
n_bus2 <- youtube_data_activities %>% distinct(business_type_level_2) %>% nrow()
maxnbus2 <- yt_bus2 %>% group_by(yt_video_id,idea) %>% 
    distinct(yt_video_id,idea,business_type_level_2) %>% ungroup() %>% group_by(yt_video_id,idea) %>% summarise(nbus2=n()) %>% arrange(-nbus2) %>% pull(nbus2) %>% max()

The second-level classification of Business Activities for making money includes many more categories (51). Many combinations of Business Types were possible for each video/idea, but none included more than 3.

At this level no particular business type predominated, none represented >20% of suggested activities. The most frequent business type was Publication, Media & Blog - Affiliate Marketing, followed by Service Business - Microwork/tasks (e.g. Taking surveys, searching for data, tagging content and identifying images). Other types were much less common.

bus2ct <- yt_bus2 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_2) %>% 
  tabyl(business_type_level_2) %>% arrange(-n)

bus2ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% filter(n>1) %>% 
  gt() %>% tab_header("YouTube",subtitle = "Business Types (LEVEL 2), all videos") %>% tab_footnote(footnote = "n=1 not shown",
                           locations = cells_column_labels(
                             columns = n
                           )) 
YouTube
Business Types (LEVEL 2), all videos
business_type_level_2 n1 percent
Publication, Media & Blog - Affiliate Marketing 86 0.19
Service Business - Microwork/tasks (e.g. Taking surveys, searching for data, tagging content and identifying images) 64 0.14
Publication, Media & Blog - YouTube 35 0.08
Service Business - Other freelance (e.g. on Upwork, Fiverr) 25 0.06
Service Business - Agency (e.g. Social Media Marketing, SEO, etc.) 19 0.04
Ecommerce & Consumer - Online shop on marketplaces (e.g. Ebay, Etsy) 18 0.04
Publication, Media & Blog - Courses 17 0.04
Ecommerce & Consumer - Dropshipping 14 0.03
Ecommerce & Consumer - Online Shop 14 0.03
Investing - Crypto 13 0.03
Publication, Media & Blog - Write a blog 11 0.02
Ecommerce & Consumer - Amazon FBA 10 0.02
Investing - Stocks 10 0.02
Ecommerce & Consumer - Sell belongings on marketplaces (e.g. Ebay, Craigslist, Etsy) 9 0.02
Investing - Real estate investing (e.g. House flipping/ Crowdfunding) 8 0.02
Publication, Media & Blog - Selling Digital Products (EBooks/Print On Demand etc)) 8 0.02
Publication, Media & Blog - Content Creator 7 0.02
Publication, Media & Blog - Influencer 5 0.01
Publication, Media & Blog - Youtube/Rumble/Pinterest Video Automation 5 0.01
Service Business - Become a Virtual Assistant 5 0.01
Software & Tech - Create A Website 5 0.01
Service Business - Freelance Writer 4 0.01
Ecommerce & Consumer - Facebook Marketplace 3 0.01
Publication, Media & Blog - Paid Community (Patreon) 3 0.01
Publication, Media & Blog - Spinning Articles 3 0.01
Publication, Media & Blog - Youtube Sponsorships 3 0.01
Service Business - Rent out stuff (e.g. storage space, truck) 3 0.01
Publication, Media & Blog - Image Sharing 2 0.00
Publication, Media & Blog - Membership Sites 2 0.00
Publication, Media & Blog - Newsletter/ Articles 2 0.00
Publication, Media & Blog - NFT's 2 0.00
Publication, Media & Blog - Podcasting 2 0.00
Publication, Media & Blog - Write a book 2 0.00
Service Business - Home Delivery Services 2 0.00
Service Business - Home Services (e.g. Power Washing, Pet sitting) 2 0.00
Service Business - Video Editor 2 0.00
Software & Tech - Create a mobile app 2 0.00
Software & Tech - Create a software 2 0.00
Software & Tech - Create Templates 2 0.00
1 n=1 not shown

The Business type with the highest mean standardized earnings was Publication, Media & Blog - Newsletter/ Articles, followed by Ecommerce & Consumer - Dropshipping and other types in the Publication/Media/Blogging fields.

yt_hourlycorrs_bus2 <- yt_hourlycorrs  %>% group_by(yt_video_id,idea) %>%
unchop(business_type_level_2) %>%  ungroup()
yt_hourlycorrs_bus2 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_2,hourly_earn) %>% ungroup() %>% 
  group_by(business_type_level_2) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-median_earn) %>% gt() %>% tab_header("YouTube, Business Type Level 2",
                                                subtitle = "standardized hourly earning by business types")
YouTube, Business Type Level 2
standardized hourly earning by business types
business_type_level_2 mean_earn median_earn
Publication, Media & Blog - Newsletter/ Articles 312.50000 312.50000
Ecommerce & Consumer - Dropshipping 255.56820 255.56820
Publication, Media & Blog - Youtube Sponsorships 230.62500 230.62500
Publication, Media & Blog - Spinning Articles 118.50000 175.00000
Service Business - Sales Representative/Advisor 170.00000 170.00000
Publication, Media & Blog - Youtube/Rumble/Pinterest Video Automation 105.00000 105.00000
Publication, Media & Blog - Affiliate Marketing 151.74915 81.25000
Publication, Media & Blog - YouTube 118.58227 75.00000
Publication, Media & Blog - Membership Sites 70.00000 70.00000
Service Business - Rent out stuff (e.g. storage space, truck) 68.75000 68.75000
Investing - Crypto 62.50000 62.50000
Publication, Media & Blog - Content Creator 50.00000 50.00000
Software & Tech - Create A Website 47.50000 47.50000
Service Business - Home Services (e.g. Power Washing, Pet sitting) 45.62500 45.62500
Service Business - Home Delivery Services 42.25000 42.25000
Service Business - Agency (e.g. Social Media Marketing, SEO, etc.) 47.08333 37.50000
Service Business - Become a Virtual Assistant 40.62500 35.00000
Investing - Real estate investing (e.g. House flipping/ Crowdfunding) 31.35417 31.35417
Service Business - Freelance Writer 40.50000 30.00000
Publication, Media & Blog - NFT's 26.25000 26.25000
Publication, Media & Blog - Selling Digital Products (EBooks/Print On Demand etc)) 28.43240 25.59812
Publication, Media & Blog - Write a blog 25.25000 25.25000
Publication, Media & Blog - Creating Spotify Ads 25.00000 25.00000
Publication, Media & Blog - Courses 71.15000 20.00000
Ecommerce & Consumer - Online shop on marketplaces (e.g. Ebay, Etsy) 20.84028 16.77083
Service Business - Other freelance (e.g. on Upwork, Fiverr) 24.62292 15.00000
Service Business - Video Editor 15.00000 15.00000
Publication, Media & Blog - Paid Community (Patreon) 27.30518 10.00000
Service Business - Data Entry 10.00000 10.00000
Ecommerce & Consumer - Online Shop 9.09500 9.09500
Investing - Stocks 22.33333 4.00000
Service Business - Microwork/tasks (e.g. Taking surveys, searching for data, tagging content and identifying images) 30.40132 3.00000

The five business types at this level with n>6 show a wide range of earnings.

yt_hourlycorrs_bus2 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_2,hourly_earn) %>% ungroup %>% add_count(business_type_level_2) %>% filter(n>6) %>% 
  ggplot()+
  geom_sina(aes(x=str_wrap(business_type_level_2,33),
                  y=hourly_earn,color=business_type_level_2))+
  scale_color_discrete(guide="none")+labs(x="Business Type (level 1)")+
  coord_flip()+labs(x="Business Type Level 2")

TikTok - Level 1

For TikTok videos, the predominant Business Type for the money-making ideas was Service Business with almost 50% of videos, followed by the Ecommerce & Consumer ventures. Other business types were less common.

# tt without earnings
tt_acts_chp <- tiktok_data_activities  %>% group_by(tt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()

tt_bus1 <- tiktok_data_activities  %>% group_by(tt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

tt_bus1ct <- tt_bus1 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_1)  %>% 
  tabyl(business_type_level_1) %>% arrange(-n)


tt_bus1ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% 
  gt() %>% tab_header("TikTok",subtitle = "Business Types, all videos") 
TikTok
Business Types, all videos
business_type_level_1 n percent
Service Business 74 0.49
Ecommerce & Consumer 36 0.24
Publication, Media & Blog 28 0.19
Investing 12 0.08
Software & Tech 1 0.01

For videos and ideas with reported earnings, the business activity with the highest earnings (standardized) was Investing, followed by Ecommerce & Consumer

# tt with earninings
tt_hourlycorrs <- left_join(tt_hourly_earn,tiktok_data_activities)

tt_hourlycorrs_chp <- tt_hourlycorrs  %>% group_by(tt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()

tt_hourlycorrs_bus1 <- tt_hourlycorrs  %>% group_by(tt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

tt_hourlycorrs_bus1 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup() %>% 
  group_by(business_type_level_1) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-median_earn) %>% gt()
business_type_level_1 mean_earn median_earn
Investing 230.37097 135.0000
Ecommerce & Consumer 215.47273 109.9006
Publication, Media & Blog 221.63810 31.2500
Service Business 58.37537 25.0000
Software & Tech 1.48500 1.4850

However, with some exceptions, earnings do not vary considerably across the different business types

tt_hourlycorrs_bus1 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup %>% 
  ggplot()+
  geom_jitter(aes(x=str_wrap(business_type_level_1,12),
                  y=hourly_earn,color=business_type_level_1))+
  scale_color_discrete(guide="none")+labs(x="Business Type (level 1)",subtitle = "TikTok")

# 1 saving plots in pdf with example

# ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color= drv)) + 
#      geom_smooth(mapping = aes(linetype = drv), method = 'loess') +
#      geom_point()
# 
# if(save == T){ 
#   ggsave(here::here("plots", "name_plot.pdf"), 
#          width = 12.5, height = 8, device = cairo_pdf)
# }

# 2 pdfs will then be converted into the pngs using the 04_convert_pdfs_to_pngs.rmd file. 

View counts, comments, followers, and standardized earnings are not tightly associated.

yt_hourly_renamed <-  yt_hourlycorrs_chp %>%  
  select(earn=hourly_earn,views=view_count,source,comments,followers=subs_numeric)
  
tt_hourly_renamed <-  tt_hourlycorrs_chp %>%  
  select(earn=hourly_earn,views=play_count,source,comments=comment_count,
           followers=author_meta_fans)

hourlyboth <- bind_rows(yt_hourly_renamed,tt_hourly_renamed)

ggplot(hourlyboth)+aes(x=views,y=earn,color=source)+geom_point()

ggplot(hourlyboth)+aes(x=comments,y=earn,color=source)+geom_point()

ggplot(hourlyboth)+aes(x=followers,y=earn,color=source)+geom_point()

TikTok - Level 2

n_bus2tt <- tiktok_data_activities %>% distinct(business_type_level_2) %>% nrow()

tt_bus2 <- tiktok_data_activities  %>% group_by(tt_video_id,idea) %>%
  unchop(business_type_level_2) %>%  ungroup()
maxnbus2tt <- tt_bus2 %>% group_by(tt_video_id,idea) %>% 
    distinct(tt_video_id,idea,business_type_level_2) %>% ungroup() %>% group_by(tt_video_id,idea) %>% summarise(nbus2=n()) %>% arrange(-nbus2) %>% pull(nbus2) %>% max()

TikTok videos mentioned fewer (33) Business Activities in this level than Youtube videos. Many combinations of Business Types were possible for each video/idea, but none included more than 3.

No particular business type predominated, none represented >20% of suggested activities. The most frequent business type was Service Business - Microwork/tasks (e.g. Taking surveys, searching for data, tagging content and identifying images), followed by Ecommerce & Consumer - Online shop on marketplaces (e.g. Ebay, Etsy). Other types were much less common.

bus2ctt <- tt_bus2 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_2) %>% 
  tabyl(business_type_level_2) %>% arrange(-n)

bus2ctt %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% filter(n>1) %>% 
  gt() %>% tab_header("TikTok",subtitle = "Business Types (LEVEL 2), all videos") %>% tab_footnote(footnote = "n=1 not shown",
                           locations = cells_column_labels(
                             columns = n
                           )) 
TikTok
Business Types (LEVEL 2), all videos
business_type_level_2 n1 percent
Service Business - Microwork/tasks (e.g. Taking surveys, searching for data, tagging content and identifying images) 30 0.16
Ecommerce & Consumer - Online shop on marketplaces (e.g. Ebay, Etsy) 16 0.09
Ecommerce & Consumer - Dropshipping 14 0.08
Service Business - Agency (e.g. Social Media Marketing, SEO, etc.) 10 0.05
Publication, Media & Blog - Affiliate Marketing 9 0.05
Service Business - Other freelance (e.g. on Upwork, Fiverr) 9 0.05
Ecommerce & Consumer - Amazon FBA 8 0.04
Investing - Real estate investing (e.g. House flipping/ Crowdfunding) 8 0.04
Ecommerce & Consumer - Online Shop 7 0.04
Publication, Media & Blog - Selling Digital Products (EBooks/Print On Demand etc)) 7 0.04
Publication, Media & Blog - YouTube 6 0.03
Service Business - Sales Representative/Advisor 5 0.03
Service Business - Vending Machine 5 0.03
Ecommerce & Consumer - Facebook Marketplace 4 0.02
Publication, Media & Blog - Image Sharing 4 0.02
Publication, Media & Blog - Youtube/Rumble/Pinterest Video Automation 4 0.02
Service Business - Become a Virtual Assistant 4 0.02
Service Business - Home Delivery Services 4 0.02
Service Business - Home Services (e.g. Power Washing, Pet sitting) 4 0.02
Ecommerce & Consumer - Sell belongings on marketplaces (e.g. Ebay, Craigslist, Etsy) 3 0.02
Investing - Stocks 3 0.02
Publication, Media & Blog - NFT's 3 0.02
Service Business - Freelance Writer 3 0.02
Service Business - Rent out room (e.g. Airbnb) 3 0.02
Service Business - Car Wash 2 0.01
Service Business - Furniture Flipping 2 0.01
Service Business - Rent out stuff (e.g. storage space, truck) 2 0.01
1 n=1 not shown

The Business type with the highest mean standardized earnings was Publication, Media & Blog - Write a blog (although this result is driven by a single video stating $300000 in monthly earnings through this method). After that Service Business - Furniture Flipping and Investing - Stocks have similar earnings, followed by various others.

tt_hourlycorrs_bus2 <- tt_hourlycorrs  %>% group_by(tt_video_id,idea) %>%
unchop(business_type_level_2) %>%  ungroup()

tt_hourlycorrs_bus2 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_2,hourly_earn) %>% ungroup() %>% 
  group_by(business_type_level_2) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-median_earn) %>% gt() %>% tab_header("TikTok, Business Type Level 2",
                                                subtitle = "standardized hourly earning by business types")
TikTok, Business Type Level 2
standardized hourly earning by business types
business_type_level_2 mean_earn median_earn
Publication, Media & Blog - Write a blog 1500.00000 1500.00000
Service Business - Furniture Flipping 281.25000 281.25000
Investing - Stocks 278.05646 278.05646
Investing - Real estate investing (e.g. House flipping/ Crowdfunding) 135.00000 135.00000
Ecommerce & Consumer - Amazon FBA 195.74375 119.80125
Ecommerce & Consumer - Online Shop 100.00000 100.00000
Ecommerce & Consumer - Online shop on marketplaces (e.g. Ebay, Etsy) 217.68778 75.00000
Service Business - Agency (e.g. Social Media Marketing, SEO, etc.) 73.81944 75.00000
Service Business - Other freelance (e.g. on Upwork, Fiverr) 79.88095 75.00000
Service Business - Home Delivery Services 63.48438 63.48438
Ecommerce & Consumer - Dropshipping 184.46576 50.00000
Service Business - Car Wash 50.00000 50.00000
Service Business - Home Services (e.g. Power Washing, Pet sitting) 45.00000 40.00000
Publication, Media & Blog - Affiliate Marketing 281.49048 37.50000
Publication, Media & Blog - YouTube 501.50000 37.50000
Publication, Media & Blog - Youtube/Rumble/Pinterest Video Automation 323.33333 37.50000
Service Business - Freelance Writer 68.05556 30.00000
Ecommerce & Consumer - Sell belongings on marketplaces (e.g. Ebay, Craigslist, Etsy) 23.73000 23.73000
Service Business - Become a Virtual Assistant 29.99500 21.98500
Publication, Media & Blog - Selling Digital Products (EBooks/Print On Demand etc)) 37.50000 18.75000
Service Business - Rent out room (e.g. Airbnb) 18.31750 18.31750
Service Business - Microwork/tasks (e.g. Taking surveys, searching for data, tagging content and identifying images) 64.58271 18.00000
Service Business - Sales Representative/Advisor 18.74000 18.00000
Software & Tech - Create a software 1.48500 1.48500
Service Business - Vending Machine 0.71625 0.71625

The six business types at this level with n>5 also show a wide range of earnings.

tt_hourlycorrs_bus2 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_2,hourly_earn) %>% ungroup %>% add_count(business_type_level_2) %>% filter(n>5) %>% 
  ggplot()+
  geom_sina(aes(x=str_wrap(business_type_level_2,33),
                  y=hourly_earn,color=business_type_level_2))+
  scale_color_discrete(guide="none")+labs(x="Business Type (level 1)")+
  coord_flip()+labs(x="Business Type Level 2")

Skills

Different money-making ideas on both platforms varied in the number of skills needed to generate earnings. For the most part, each idea needed only one or two different skills, and this was more evident on TikTok (much shorter videos with generally only one money-making idea).

skills_per_idea_yt <- 
youtube_data_activities %>% select(yt_video_id,idea,skills_required) %>% 
  group_by(yt_video_id,idea) %>%  distinct() %>% 
summarise(n_skills=n()) %>% ungroup() %>% mutate(source="YouTube") %>% 
    select(n_skills,source)

skills_per_idea_tt <- 
tiktok_data_activities %>% select(tt_video_id,idea,skills_required) %>% 
  group_by(tt_video_id,idea) %>%  distinct() %>% 
summarise(n_skills=n()) %>% ungroup()%>% mutate(source="TikTok") %>% 
  select(n_skills,source)

bind_rows(skills_per_idea_tt,skills_per_idea_yt) %>% 
ggplot()+
  geom_histogram(aes(x=n_skills,fill=source))

The makeup of required skills also varied across platforms.

On YouTube, the most mentioned skill was Marketing followed by Image or Video Editing.

youtube_data_activities %>% select(yt_video_id,idea,skills_required) %>% distinct() %>% tabyl(skills_required) %>% arrange(-n) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% na.omit %>% gt() %>% tab_header(title = "YouTube")
YouTube
skills_required n percent
Marketing 152 0.24
Image Editing 83 0.13
Video Editing 81 0.13
Writing 70 0.11
Investing 69 0.11
Doing mircowork for businesses 67 0.11
Speaking 54 0.09
Web Development 29 0.05
Programming 22 0.03
Providing Home Services 3 0.00
Gaming 2 0.00

When an idea or video required two or more skills, the most common combinations were Speaking + Video Editing, Investing + Marketing, Speaking + Writing, and Image + Video Editing.

youtube_data_activities %>% select(yt_video_id,idea,skills_required) %>% 
  group_by(yt_video_id,idea) %>% 
  arrange(yt_video_id, skills_required) %>% 
  summarize(combination = paste0(skills_required, collapse = " - "), .groups = "drop") %>% 
  count(combination) %>% 
  filter(str_detect(combination," - ")) %>%  arrange(-n) %>% slice(1:10) %>% 
  gt() %>% tab_header("YouTube Data", subtitle="Combinations of >2 skills, Top 10 most common combinations shown")
YouTube Data
Combinations of >2 skills, Top 10 most common combinations shown
combination n
Speaking - Video Editing 13
Investing - Marketing 12
Speaking - Writing 12
Image Editing - Video Editing 7
Image Editing - Marketing 6
Image Editing - Marketing - Programming - Video Editing - Web Development - Writing 6
Image Editing - Investing 5
Image Editing - Speaking - Video Editing 4
Image Editing - Speaking - Video Editing - Writing 4
Investing - Investing 4

TikTok videos favored Investing, followed by Marketing and Writing.

tiktok_data_activities %>% group_by(tt_video_id,idea) %>% tabyl(skills_required) %>% arrange(-n) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% na.omit %>% gt() %>% tab_header(title = "TikTok")
TikTok
skills_required n percent
Investing 73 0.26
Marketing 57 0.20
Writing 37 0.13
Image Editing 33 0.12
Doing mircowork for businesses 32 0.11
Video Editing 21 0.07
Providing Home Services 9 0.03
Speaking 8 0.03
Web Development 8 0.03
Programming 5 0.02
Gaming 1 0.00

Despite the fewer videos that required >1 skill, there were some frequently mentioned skill combinations such as Investing + Marketing, followed by Marketing+Programming+Video Editing+Web Development+Writing.

tiktok_data_activities %>% select(tt_video_id,idea,skills_required) %>% 
  group_by(tt_video_id,idea) %>% 
  arrange(tt_video_id, skills_required) %>% 
  summarize(combination = paste0(skills_required, collapse = " - "), .groups = "drop") %>% 
  count(combination) %>% 
  filter(str_detect(combination," - ")) %>%  arrange(-n) %>% slice(1:10) %>% 
  gt() %>% tab_header("TikTok Data", subtitle="Combinations of >2 skills, Top 10 most common combinations shown")
TikTok Data
Combinations of >2 skills, Top 10 most common combinations shown
combination n
Investing - Marketing 9
Marketing - Programming - Video Editing - Web Development - Writing 4
Doing mircowork for businesses - Doing mircowork for businesses - Writing - Writing 3
Doing mircowork for businesses - Speaking 3
Image Editing - Image Editing 3
Marketing - Marketing 3
Doing mircowork for businesses - Writing 2
Image Editing - Image Editing - Image Editing - Image Editing - Investing - Investing - Investing - Investing 2
Image Editing - Image Editing - Investing - Investing - Marketing - Marketing 2
Image Editing - Image Editing - Marketing - Marketing 2

Skills and earnings

Across both platforms, the Skill with the highest mean standardized earnings (hourly) was Investing, followed by Marketing, Video Editing, and Writing.

ttskillearn <- 
tt_hourlycorrs %>% select(tt_video_id,idea,hourly_earn,skills_required) %>% group_by(tt_video_id,idea) %>% distinct() %>% 
  group_by(skills_required) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-mean_earn) 

ytskillearn <- 
yt_hourlycorrs %>% select(yt_video_id,idea,hourly_earn,skills_required) %>% group_by(yt_video_id,idea) %>% distinct() %>% 
  group_by(skills_required) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-mean_earn)

ytskillearns <- 
yt_hourlycorrs %>% select(yt_video_id,idea,hourly_earn,skills_required) %>% group_by(yt_video_id,idea) %>% distinct() %>% 
  ungroup %>%  mutate(source="YouTube")

ttskillearns <-  
tt_hourlycorrs %>% select(tt_video_id,idea,hourly_earn,skills_required) %>% group_by(tt_video_id,idea) %>% distinct() %>% 
ungroup %>% mutate(source="TikTok")

bothearn <- bind_rows(ytskillearns,ttskillearns) %>% group_by(skills_required) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-mean_earn)

gt(bothearn) %>% tab_header(title='all platforms',subtitle = "earnings by skill")
all platforms
earnings by skill
skills_required mean_earn median_earn
Investing 156.65162 50.00
Marketing 144.68466 50.00
Video Editing 137.25115 45.00
Writing 85.73030 20.00
Speaking 79.49211 20.00
Providing Home Services 45.30208 40.00
Doing mircowork for businesses 39.19790 11.25
Programming 38.74833 40.00
Web Development 37.03615 37.50
Gaming 36.00000 36.00
Image Editing 35.85725 25.00

By platform, differences in reported earnings appear. On YouTube, Marketing and Investing report the highest mean earnings. On TikTok, Video Editing and Investing report the highest earnings.

gt(ytskillearn)%>% tab_header(title='YouTube',subtitle = "earnings by skill")
YouTube
earnings by skill
skills_required mean_earn median_earn
Marketing 128.49585 62.50
Investing 107.55416 40.00
Video Editing 75.93473 50.00
Writing 63.99792 20.00
Speaking 53.45010 20.00
Web Development 43.12500 38.75
Programming 42.75000 40.00
Doing mircowork for businesses 32.01130 5.02
Providing Home Services 31.91667 25.00
Image Editing 31.00885 18.75
gt(ttskillearn) %>% tab_header(title='TikTok',subtitle = "earnings by skill")
TikTok
earnings by skill
skills_required mean_earn median_earn
Video Editing 364.99786 37.5000
Investing 189.38327 100.0000
Marketing 186.77555 50.0000
Speaking 153.27778 47.5000
Writing 110.56730 20.2200
Image Editing 52.82667 39.7300
Providing Home Services 51.99479 45.0000
Doing mircowork for businesses 49.78868 15.0000
Gaming 36.00000 36.0000
Programming 33.74625 33.4925
Web Development 27.29400 21.9850

Additionally, TikTok videos report significantly higher earnings for the same skills compared with YouTube videos.

ytskearnsummary <- ytskillearn %>% mutate(source="YouTube")
ttskearnsummary <- ttskillearn %>% mutate(source="TikTok")
bind_rows(ytskearnsummary,ttskearnsummary) %>% 
  ggplot()+
  geom_bar(aes(x=fct_reorder(skills_required,mean_earn),y=mean_earn,fill=source),stat="identity",position = "dodge")+coord_flip()+labs(x='skill')

The variation and spread of earnings by skill is consistent across platforms.

bind_rows(ytskillearns,ttskillearns) %>% 
  ggplot(aes(x=fct_reorder(skills_required,hourly_earn),y=hourly_earn,color=source))+
  geom_sina()+labs(x='skill')+
  coord_flip()

More required skills did not relate with greater earnings.

# skills_per_idea_ytid <- 
# youtube_data_activities %>% select(yt_video_id,idea,skills_required) %>% 
#   group_by(yt_video_id,idea) %>%  distinct() %>% 
# summarise(n_skills=n()) %>% ungroup() %>% mutate(source="YouTube")  
# 
# skills_per_idea_ttid <- 
# tiktok_data_activities %>% select(tt_video_id,idea,skills_required) %>% 
#   group_by(tt_video_id,idea) %>%  distinct() %>% 
# summarise(n_skills=n()) %>% ungroup()%>% mutate(source="TikTok")  

skillsearntt <- left_join(skills_per_idea_tt,tt_hourly_earn)
skillsearnyt <- left_join(skills_per_idea_yt,yt_hourly_earn)
bind_rows(skillsearntt,skillsearnyt) %>% 
  ggplot(aes(x=factor(n_skills),hourly_earn))+geom_boxplot()+
  labs(x="number of skills required per money-making idea",y="standardized earnings")

Video titles

In general, the video titles vary considerably across platforms in terms of length, content and style.

tiktok_data <-  tiktok_data %>% mutate(title_noHash=str_extract(text,"^[^#]*")) 
yt_tlength <- round(mean(str_length(youtube_data$title)),0)
tt_tlength <-round(mean(str_length(tiktok_data$text)))
tt_tlength_nh <-round(mean(str_length(tiktok_data$title_noHash)))

Without various trailing hashtags, YouTube video titles are on average, twice as long as TikTok titles (65 vs. 31 characters). Overall, roughly a third of the length of TikTok titles comprises various hashtags.

The words and bigrams (consecutive sequences of two words) that appear most frequently in the video’s titles vary significantly between platforms.

# tokenize 
stopwords <- c("for","in","a","the","to","with","from","by")
title_words_yt <- youtube_data %>% unnest_tokens(title_wrd,title,token = "words") %>% 
  filter(!title_wrd %in% stopwords)
title_bigrams_yt <- youtube_data %>% unnest_tokens(title_bg,title,token = "ngrams",n=2) 
title_words_tt <- tiktok_data %>% unnest_tokens(title_wrd,text,token = "tweets") %>% 
  filter(!title_wrd %in% stopwords)
title_bigrams_tt <- title_words_tt %>% mutate(nextwrdbg=lead(title_wrd)) %>% 
  unite(title_bg, title_wrd, nextwrdbg, sep = ' ')


wordsyt <- title_words_yt %>% count(title_wrd) %>% slice_max(n,n=15) %>% mutate(source="YouTube")
wordstt <- title_words_tt %>% count(title_wrd) %>% slice_max(n,n=15) %>% mutate(source="TikTok")
bg_yt <- title_bigrams_yt %>% count(title_bg) %>% slice_max(n,n=15) %>% mutate(source="YouTube")
bg_tt <- title_bigrams_tt %>% count(title_bg) %>% slice_max(n,n=15) %>% mutate(source="TikTok")
top15wrds <- bind_rows(wordsyt,wordstt)
top15bg <- bind_rows(bg_yt,bg_tt)

ggplot(top15wrds)+
  geom_lollipop(aes(x=fct_reorder(title_wrd,n),y=n))+
  facet_wrap(~source)+labs(x="word or hashtag",y='occurrences')+
  coord_flip()

ggplot(top15bg)+
  geom_lollipop(aes(x=fct_reorder(title_bg,n),y=n))+
  facet_wrap(~source)+labs(x="bigram",y='occurrences')+
  coord_flip()

Considering the top 15 words or bigrams, there is little overlap between platforms.